1 Introduction

1.1 Birth Seasonality

It has been well documented that live birth follows a seasonal pattern with more babies born in Spring/Summer in most countries of the northen Hemisphere.

We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]

birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)

# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
j = birth$Month %in% months
birth = birth[which(j),]
rm(j)

# proper date
birth$date = as.Date(paste0(birth$Year, "-",birth$Month,"-01"), format = c("%Y-%B-%d"))
range(birth$date)
## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
birth$Country.or.Area = gsub("United States of America","United States", birth$Country.or.Area)

# getting geo-location
mat = match(birth$Country.or.Area, countriesLow$NAME)

birth$lat = countriesLow$LAT[mat]
birth$lon = countriesLow$LON[mat]
rm(mat)

# sorting countries by latitude

countries.levels = unique(birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]

birth$Country.or.Area = factor(birth$Country.or.Area, levels = countries.levels)

save(birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))

rm(countries.levels, m, lat, o)
# "Australia", "New Zealand","Bermuda", "Greenland", "Guadeloupe",
countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan","Guatemala","American Samoa","New Caledonia","Seychelles","Chile")
# order countries by decreasing latitude
mat = match(countries, countriesLow$NAME)
lat = countriesLow$LAT[mat]
o = order(lat, decreasing = TRUE)
countries = countries[o]
rm(o, lat, mat)

# select countries. Selected countries had long term data and included those from Northern and Southern hemisphere
c = which(birth$Country.or.Area %in% countries)


g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + ggtitle("Absolute number of live birth for selected countries")

g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90))  + 
  ggtitle("Number of live birth for selected countries /!\ y-scale does not include 0")

countries2 = countries[-which(countries %in% c("Guatemala","Switzerland","Finland"))]

cc = which((birth$Country.or.Area %in% countries2) & (birth$Year >= 2000))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 0,hjust = 0),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank())+ #expand_limits(y=25)+
  guides(col = FALSE)+ ylab("Number of live births (min to max values)")+
  ggtitle("Number of live birth for selected countries in the last 2 decades")

long_ago = 1978:1981

cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% long_ago))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the late 70's")

cc = which((birth$Country.or.Area %in% countries) & (birth$Year %in% long_ago))

g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the late 70's")

recently = 2011:2013

cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% recently))


g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in recent years")

sh = which(birth$lat < 0)

g = ggplot(birth[sh,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") + 
  theme(strip.text.y = element_text(angle = 90)) + 
  ggtitle("Number of live birth for selected countries in the SOUTH hemisphere")

rm(sh)

Martinez-Bakker et al., 2014 have shown that the peak time of birth follows a pattern that depends on the latitude of the considered country.

To better understand this relationship and to later investigate the potential origin of birth seasonality, we do a rhythm analysis of the birth patterns to determine the phase (= the peak time) and the relative amplitude of the birth signals, taken year by year.

BR = data.frame()

for(country in unique(birth$Country.or.Area)){
  
  c = which(birth$Country.or.Area == country)
  b = birth[c,]
  years = table(b$Year)
  years = names(years)[years == 12]
  
  for(y in years){
    
    j = which(b$Year == y)
    
    pft = periodic.fisher.test(t = 1:12, x = b$Value[j], p = 12)
    
    line = data.frame(country = country, year = as.numeric(y), pval = pft$pval , phase = pft$phase, rel.ampl = pft$rel.amp)
    
    BR = rbind(BR, line)
    
  }
}

rm(line,c, b, j,y, country, pft, years)
hist(BR$pval, breaks = 50)

# we expect p-value to be small if there is rhythmicity
BR$qval = p.adjust(BR$pval, method = "BH")
# and plot the phase (in month), show the p-value and order by latitude. 

m = match(BR$country, countriesLow$NAME)
BR$lat = countriesLow$LAT[m]
BR$lon = countriesLow$LON[m]
BR$significant_rhythm = BR$qval<=0.1
BR$month = BR$phase
BR$latitude = BR$lat

rm(m)

g = ggplot(BR, aes(x = phase, y=  lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
  ggtitle("Peak month of birth by latitude and years")
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).

g = ggplot(BR, aes(x = phase, y=  lat, col = year, alpha = qval))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
  scale_alpha(range = c(0.1,1))
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).

  ggtitle("Peak month of birth by latitude and years")
## $title
## [1] "Peak month of birth by latitude and years"
## 
## attr(,"class")
## [1] "labels"
g = ggplot(BR, aes(x = month, y=  latitude, col = year, alpha = significant_rhythm))
g + geom_point(size = 0.75) + #xlim(c(0,12)) +
  geom_hline(yintercept = 0)+
  scale_x_continuous(breaks = 0:12) +
  scale_alpha_discrete(range = c(0.2,1))+
  scale_color_gradient(low = "red4", high = "coral1")+
  ggtitle("Peak month of birth by latitude and years")
## Warning: Using alpha for a discrete variable is not advised.

## Warning: Removed 658 rows containing missing values (geom_point).

#mapw = map_data("world")

#g = ggplot()
#g + geom_polygon(data = mapw, aes(x=long, y = lat, group = group), fill=  NA, col = "gray") +
#  geom_point(data =BR, aes(x = lon + year-2000, y = lat, col = phase, alpha = (qval<=0.1)/2+0.3)) 
# non-transparent means significant. Phase is the month number
#rm(mapw)

We indeed observe this latitude-dependency for the peak time of birth. Interestingly, for some countries in the Nothern hemisphere, we also see a shift towards later later peak time in recent years.

Having a closer look to some countries with measurements for many years:

c = which(BR$country %in% countries)

g = ggplot(BR[c,], aes(x = phase, y = year))
g + geom_point(aes(col = (qval <= 0.1))) + 
  facet_grid( country ~ .) + 
  theme_minimal() +
  theme(strip.text.y = element_text(angle = 0))+
  ggtitle("Shift in birth month since the 70's for many countries")

# note that for some countries, the trend for the phase does change over the years

subset_countries =  c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan")

g = ggplot(BR[BR$country %in% subset_countries, ], aes(x = phase, y = year, col = country))
g + geom_point(aes(alpha = (qval <= 0.1)/2+0.5)) +
    geom_path(data = BR[(BR$country %in% subset_countries) & (BR$qval <= 0.1),], aes(x = phase, y = year, col = country)) +
  xlim(c(0,12))+
  theme_minimal() +
  theme(strip.text.y = element_text(angle = 0))+
  ggtitle("Except for the US and Israel, most countries have seen their peak month of birth being delayed")

??? Maybe should first de-trend the data before computing the phase? But it does not seem to correlate with the overall trends…

We were then wondering if the amplitude of the oscillations were also dependent on latitude.

g = ggplot(BR, aes(x =  rel.ampl , y =  lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + ggtitle("Relative amplitude of birth seasonality by latitude")
## Warning: Removed 658 rows containing missing values (geom_point).

It does not seem to be the case.

1.1.1 Birth seasonality - Summary

  • Phase ( = peak month) of birth seasonality depends on latitude.
  • Amplitude (~ fold change) of birth seasonality depends on age of mothers.

In EU and US (which is the main population of the two apps) in recent years, the phases are quite similar (despite the relatively large variation in latitude), we can thus use Sympto and Kindara’s dataset to explore the origin of seasonality: i.e. is it a change in sexual behavior or a change in fertility?

countries.s = c("Germany", "Switzerland", "France","United States")


g = ggplot(BR[BR$country %in% countries.s,], aes(x = phase, y = year, col = country))
g + geom_point() + xlim(c(0,12)) + geom_hline(yintercept = 2010) +
  ggtitle("Countries for which we have users in Sympto's dataset have similar peak month in the last decade")

The peak birth month for these countries in recent years are July-August. The estimated time of conception is thus ~ November.

For Sympto, we have both location and user’s birth year (age). For Kindara, we currently do not have access to this information. Unfortunately, Sympto’s dataset is a lot smaller than Kindara’s dataset.

1.2 Menstrual-cycle & fertility tracking apps

Clue, Kindara, Sympto

2 Methods

2.1 Birth Data processing

2.1.1 UN data

We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]

UN_birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)

# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
UN_birth = UN_birth %>% dplyr::filter(Month %in% months)

# proper date
UN_birth$date = as.Date(str_c(UN_birth$Year, "-",UN_birth$Month,"-01"), format = c("%Y-%B-%d"))
range(UN_birth$date)
## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
UN_birth$Country.or.Area = gsub("United States of America","United States", UN_birth$Country.or.Area)
UN_birth$Country.or.Area = gsub("United Kingdom of Great Britain and Northern Ireland","United Kingdom", UN_birth$Country.or.Area)


# getting geo-location
mat = match(UN_birth$Country.or.Area, countriesLow$NAME)

UN_birth$lat = countriesLow$LAT[mat]
UN_birth$lon = countriesLow$LON[mat]
rm(mat)

# sorting countries by latitude
countries.levels = unique(UN_birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]

UN_birth$Country.or.Area = factor(UN_birth$Country.or.Area, levels = countries.levels)


# Data source
UN_birth$source = "UN"

save(UN_birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))

rm(countries.levels, m, lat, o)

2.1.2 Brasil data

The birth data were downloaded on Feb 21, 2018 from [http://svs.aids.gov.br/dantps/centrais-de-conteudos/infograficos/natalidade/]

BR_birth = read_csv(file = str_c(IO$r_Data,"Brazil_birth_data/","monthly_births_Brazil_detrended_2000_2017_wavelet_gamm_timing.csv"))
## Parsed with column specification:
## cols(
##   year = col_double(),
##   location = col_character(),
##   month = col_character(),
##   month.number = col_double(),
##   births = col_double(),
##   time = col_double(),
##   location.type = col_character(),
##   moving.avg = col_double(),
##   percent.dev.from.mean = col_double(),
##   significant.seasonality = col_logical(),
##   seasonal.power = col_double(),
##   gamm.r2 = col_double(),
##   gamm.prediction = col_double(),
##   seasonal.strength = col_character(),
##   peak.month = col_double()
## )
BR_birth$country = "Brazil"
BR_birth$date = str_c(BR_birth$year,"-",BR_birth$month.number,"-01") %>% as.Date()
BR_birth$reliability = "Final figure"
BR_birth$source = "Micaela" # XXX To be changed to the official source

save(BR_birth, file = paste0(IO$out_Rdata,"BR_birth_data.Rdata"))

2.1.3 Collating the data together

brazil.dict.list = list("North" = c("Acre", "Amapá", "Amazonas", "Pará", "Rondônia", "Roraima", "Tocantins"),
                   "Northeast" = c("Alagoas", "Bahia", "Ceará", "Maranhão", "Paraíba", "Pernambuco", "Piauí", "Rio Grande do Norte", "Sergipe"),
                   "Central-West" = c("Goiás", "Mato Grosso", "Mato Grosso do Sul", "Distrito Federal" ),
                   "Southeast" = c("Espírito Santo", "Minas Gerais", "Rio de Janeiro", "São Paulo"),
                   "South" = c("Paraná", "Rio Grande do Sul", "Santa Catarina"))

brazil.dict = data.frame(region = rep(names(brazil.dict.list), lengths(brazil.dict.list)), states = unlist(brazil.dict.list))


UN_birth = UN_birth %>%  dplyr::mutate(
  reliability = Reliability,
  country = Country.or.Area,
  area = "Whole country",
  births = Value,
  births_moving_avg = NA,
  births_percent_dev_from_mean = NA
)

BR_birth = BR_birth %>% dplyr::mutate(
  area = brazil.dict$region[match(BR_birth$location, brazil.dict$states)],
  state = location,
  births_moving_avg = moving.avg,
  births_percent_dev_from_mean = percent.dev.from.mean
)

BR_birth_by_region = BR_birth %>% ddply(.,
                                        .(country, area, date, source, reliability),
                                        summarize,
                                        births = sum(births),
                                        births_moving_avg  = NA,
                                        births_percent_dev_from_mean = NA)


colnames = c("country","area","date","source","reliability","births","births_moving_avg","births_percent_dev_from_mean")

birth = rbind(
  UN_birth %>% dplyr::select(colnames),
  BR_birth_by_region %>% dplyr::select(colnames)
) 

save(birth, file = paste0(IO$out_Rdata,"birth_data.Rdata"))

2.2 Data preparation

2.2.1 Clue dataset

2.2.1.1 Data overview

User table:

users = read_feather(path = paste0(IO$input_clue,"users.feather"))
head(users)
## # A tibble: 6 x 7
##   user_id birth_year_bin country_area height_bin weight_bin
##   <chr>   <chr>          <chr>        <chr>      <chr>     
## 1 000320… 1970-1974      United Stat… 160-164    >=80      
## 2 bb78e5… 1975-1979      France       160-164    50-54     
## 3 9c1ae6… 1970-1974      France       165-169    70-74     
## 4 4ac787… 1970-1974      United Stat… 170-174    >=80      
## 5 1c9dfb… 1975-1979      Brazil - Ce… <NA>       <NA>      
## 6 33ae5a… 1970-1974      United King… 165-169    55-59     
## # … with 2 more variables: latest_birth_control <chr>, csv_file <chr>
dim(users)
## [1] 535154      7
cycles_00 = read_feather(path = paste0(IO$input_clue,"cycles/cycles0000.feather"))
head(cycles_00)
## # A tibble: 6 x 11
##   user_id cycle_nb cycle_start cycle_end  cycle_length period_start
##   <chr>      <int> <date>      <date>            <int> <date>      
## 1 155248…        7 2018-03-04  2018-03-28           25 2018-03-04  
## 2 b89b6d…        2 2018-12-24  2019-01-16           24 2018-12-24  
## 3 558762…        1 2018-02-10  2018-03-12           31 2018-02-10  
## 4 c2c8fd…        7 2018-05-30  2018-07-04           36 2018-05-30  
## 5 bfec0b…       23 2018-11-12  2018-12-14           33 2018-11-12  
## 6 aece4d…        1 2018-04-04  2018-04-29           26 2018-04-04  
## # … with 5 more variables: period_end <date>, period_length <int>,
## #   neg_preg_test <lgl>, pos_preg_test <lgl>, latest_preg_test <chr>
dim(cycles_00)
## [1] 287684     11
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)

tracking = read_feather(path = paste0(tracking_folder,files[1]))
head(tracking)
## # A tibble: 6 x 5
##   user_id                           date       category type         number
##   <chr>                             <date>     <chr>    <chr>        <chr> 
## 1 8dab364e26c969e3f242d1c37158d4b9… 2018-09-11 pain     tender_brea… <NA>  
## 2 f7b870737c60dd97a6e4a5522c79500a… 2018-01-09 mental   calm         <NA>  
## 3 eedb725061a1eaba164713e2815b35ac… 2017-05-06 pill_hbc late         <NA>  
## 4 f56145c29cb39fa00b727e627be976b0… 2018-08-30 sleep    6-9          <NA>  
## 5 15c407da50db2c39ef01841474798079… 2018-12-06 pill_hbc taken        <NA>  
## 6 d2277465eeab05b1c2d7982c097c13c9… 2017-11-29 sleep    3-6          <NA>
unique(tracking$category)
##  [1] "pain"          "mental"        "pill_hbc"      "sleep"        
##  [5] "period"        "ANY"           "sex"           "emotion"      
##  [9] "poop"          "energy"        "social"        "ailment"      
## [13] "digestion"     "exercise"      "fluid"         "weight"       
## [17] "medication"    "iud"           "test"          "injection_hbc"
## [21] "bbt"           "patch_hbc"

2.2.1.2 Data preparation

Workflow:

  • Split Country and Area

  • Compute estimated mean BMI

  • Re-assemble the tracking tables into batches, making sure a single user has all its tracking in the same batch

  • Add the users features to the tracking table

  • Label user’s time-series

  • Birth control (as declared by users)

  • Birth control (re-assignment based on users’ declaration and on their tracking history - see Breast Tenderness paper method)

  • User’s tracking activity (convolution by X days from “ANY” tracking to reflect that the user was using the app)

2.2.1.2.1 Countries and Areas
countries_areas = users$country_area %>% str_split_fixed(., " - ", n = 2)
users$country = countries_areas[,1]
users$area = countries_areas[,2]

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_countries.feather"))
2.2.1.2.2 BMI
users$height_bin = factor(users$height_bin, levels = dict$height$bin)
users$weight_bin = factor(users$weight_bin, levels = dict$weight$bin)
heigh_mean = dict$height$mean[match(users$height_bin,dict$height$bin)]
weight_mean = dict$weight$mean[match(users$weight_bin,dict$weight$bin)]
users$est_mean_bmi = weight_mean/((heigh_mean/100)^2)

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_bmi.feather"))


ggplot(users, aes(x = est_mean_bmi, fill = country_area))+
  geom_histogram(binwidth = 2)+
  facet_grid(country_area ~ ., scale = "free_y")
## Warning: Removed 153517 rows containing non-finite values (stat_bin).

2.2.1.2.3 Batches and tracking tables augmentation
max_batch_size = 5000
n_batch = max(par$min_n_batches, ceiling(nrow(users)/max_batch_size))
batch_size = ceiling(nrow(users)/n_batch)

users$batch = rep(1:n_batch, each = batch_size)[1:nrow(users)]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_batches.feather"))
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)

tmp_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
if(!file.exists(tmp_folder)){
  dir.create(tmp_folder,recursive = TRUE)
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)
  
  tic()
  users_original_file_ids = foreach(file = files, .combine = rbind, .packages = c("feather","stringr")) %dopar%
  {
    tracking = read_feather(path = paste0(tracking_folder, file))
    dim(tracking)
    tracking$batch = users$batch[match(tracking$user_id, users$user_id)]
    if(nrow(tracking)>0){
      for(b in unique(tracking$batch)){
        tracking_batch = tracking[which(tracking$batch == b),]
        new_file_name = gsub(".feather",paste0("_batch_",b,".feather"),file)
        write_feather(tracking_batch, path = paste0(tmp_folder, new_file_name ))
      }
      users_original_file_ids = data.frame(user_id = unique(tracking$user_id), original_file_id = str_extract(file,"\\d{4}"))
    }else{  users_original_file_ids = data.frame(user_id = character(), original_file_id = character())}
    return(users_original_file_ids)
  }
  toc()
  stopImplicitCluster()
  
  users_o_f = aggregate(original_file_id ~ user_id ,users_original_file_ids, function(x)  paste0(sort(x), collapse = ","))
  colnames(users_o_f) = c("user_id","original_tracking_files")
  write_feather(users_o_f, path = paste0(IO$tmp_clue,"original_tracking_files_per_users.feather"))
}
## 453.054 sec elapsed
input_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
if(file.exists(output_folder)){unlink(output_folder, recursive = TRUE)} ;dir.create(output_folder,recursive = TRUE)
tmp_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
if(!dir.exists(tmp_folder)){
  dir.create(tmp_folder,recursive = TRUE)
  
  batches = unique(users$batch)
  ok = foreach(b = batches) %do% {
    cat("\t",b,"||\t")
    all_files = list.files(input_folder)
    files = all_files[grep(paste0("_batch_",b,"\\."),all_files)]
    
    cl = makeCluster(par$n_cores)
    registerDoParallel(cl)
    tracking = foreach(file  = files, .combine = rbind, .packages = "feather") %dopar%{tracking = read_feather(path = paste0(input_folder, file));return(tracking)}
    stopImplicitCluster()
    
    cols_to_add = c("birth_year_bin","country_area","height_bin","weight_bin", "est_mean_bmi")
    m = match(tracking$user_id, users$user_id)
    for(col_to_add in cols_to_add){
      eval(parse(text = paste0("tracking$",col_to_add," = users$",col_to_add,"[m]")))
    }
    o = order(tracking$user_id, tracking$date, tracking$category, tracking$type, tracking$number)
    tracking = tracking[o,]
    
    new_file_name = paste0("tracking_",b,".feather")
    write_feather(tracking, path = paste0(output_folder, new_file_name ))
    ok = file.copy(from = paste0(output_folder, new_file_name ), to = paste0(tmp_folder, new_file_name), overwrite = TRUE)
  }
}
##   1 ||        2 ||        3 ||        4 ||        5 ||        6 ||        7 ||        8 ||        9 ||        10 ||       11 ||       12 ||       13 ||       14 ||       15 ||       16 ||       17 ||       18 ||       19 ||       20 ||       21 ||       22 ||       23 ||       24 ||       25 ||       26 ||       27 ||       28 ||       29 ||       30 ||       31 ||       32 ||       33 ||       34 ||       35 ||       36 ||       37 ||       38 ||       39 ||       40 ||       41 ||       42 ||       43 ||       44 ||       45 ||       46 ||       47 ||       48 ||       49 ||       50 ||       51 ||       52 ||       53 ||       54 ||       55 ||       56 ||       57 ||       58 ||       59 ||       60 ||       61 ||       62 ||       63 ||       64 ||       65 ||       66 ||       67 ||       68 ||       69 ||       70 ||       71 ||       72 ||       73 ||       74 ||       75 ||       76 ||       77 ||       78 ||       79 ||       80 ||       81 ||       82 ||       83 ||       84 ||       85 ||       86 ||       87 ||       88 ||       89 ||       90 ||       91 ||       92 ||       93 ||       94 ||       95 ||       96 ||       97 ||       98 ||       99 ||       100 ||      101 ||      102 ||      103 ||      104 ||      105 ||      106 ||      107 ||      108 || 
2.2.1.2.4 Augmenting the users table
if(!file.exists(paste0(IO$tmp_clue, "users_agg.feather"))){
  
  tracking_folder = paste0(IO$output_clue,"tracking/")
  files = list.files(tracking_folder)
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)
  
  tic()
  users_agg = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr")) %dopar%
  {
    tracking = read_feather(path = paste0(tracking_folder, file))
    users_agg = ddply(tracking,
                      .(user_id),
                      summarize,
                      first_obs = min(date),
                      last_obs = max(date),
                      n_obs_day = length(unique(date)),
                      n_obs = sum(category != "ANY"),
                      n_sex = sum(category == "sex"),
                      n_mucus =  sum(category == "fluid"),
                      n_temp = sum(category == "bbt")
    )
    return(users_agg)
  }
  toc()
  stopImplicitCluster()
  
  write_feather(users_agg, path = paste0(IO$tmp_clue, "users_agg.feather"))
  
}
## 2450.849 sec elapsed
users_agg = read_feather(path = paste0(IO$tmp_clue, "users_agg.feather"))

cols_to_add = setdiff(colnames(users_agg),"user_id")
m = match(users$user_id, users_agg$user_id)
for(col_to_add in cols_to_add){eval(parse(text = paste0("users$",col_to_add," = users_agg$",col_to_add,"[m]")))}
users$n_days = as.numeric(users$last_obs - users$first_obs)

write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_augmented.feather"))
2.2.1.2.5 Birth control

As declared by the users:

BC = read_feather(path = paste0(IO$input_clue,"birth_control.feather"))
o = order(BC$user_id, BC$date)
BC = BC[o,]
BC$birth_control = replace_na(BC$birth_control, "undefined")

write_feather(BC, path = paste0(IO$output_clue, "birth_control.feather"))
BC = read_feather(path = paste0(IO$output_clue,"birth_control.feather"))

input_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
tmp_folder = paste0(IO$tmp_clue, "tracking_with_user_defined_birth_control/")
if(!dir.exists(tmp_folder)){
  
  dir.create(tmp_folder)
  
  files = list.files(input_folder)
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)
  
  tic()
  catch = 
    foreach(file = files, 
            .combine = rbind, 
            .packages = c("feather","stringr", "plyr","tidyverse")
    ) %dopar% {
      tracking = read_feather(path = paste0(input_folder, file))
      agg = aggregate(date ~ user_id, tracking, min)
      min_date = agg$date[match(tracking$user_id, agg$user_id)]
      tracking = tracking %>%  mutate(rel_date = as.numeric(date - min_date + 1))
      
      u_tracking = tracking %>% 
        select(user_id, date) %>%  unique() %>%  
        mutate(birth_control = NA,pill_type = NA, pill_regiment = NA)
      u_BC = BC %>% select(-csv_file) %>% filter(user_id %in% unique(tracking$user_id)) %>%  unique() 
      BC_exp = rbind(u_tracking, u_BC) %>%  arrange(., user_id, date)
      
      # need to expand the birth_control variables to replace the NAs with the latest value in the file
      BC_exp = BC_exp %>% group_by(user_id) %>% mutate(
        birth_control = replace_NAs_with_latest_value(birth_control) %>%  replace_na("undefined"),
        pill_type = replace_NAs_with_latest_value(pill_type),
        pill_regiment = replace_NAs_with_latest_value(pill_regiment)
      )
      
      tracking_key = str_c(tracking$user_id, "_",tracking$date)
      BC_key = str_c(BC_exp$user_id,"_",BC_exp$date)
      m = match(tracking_key,BC_key)
      tracking$birth_control_ud = BC_exp$birth_control[m]
      tracking$pill_type_ud = BC_exp$pill_type[m]
      tracking$pill_regiment_ud = BC_exp$pill_regiment[m]
      
      
      write_feather(tracking, path = paste0(output_folder,file))
      file.copy(from = paste0(output_folder,file), to = paste0(tmp_folder,file), overwrite = TRUE)
    }
  toc()
  stopImplicitCluster()
}
## 10996.129 sec elapsed
2.2.1.2.6 Users’ active tracking periods
active_tracking_filter = function(x, N = 28*1.5){
  res = stats::filter(c(rep(0,N-1),x), filter = rep(1,N), sides = 1) %>% pmin(1)
  res = na.omit(res) %>%  as.vector()
  return(res)
}
input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)

output_folder = paste0(IO$tmp_clue, "active_tracking/")
if(!dir.exists(output_folder)){
  
  dir.create(output_folder)
  
  
  time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)
  
  cl = makeCluster(par$n_cores)
  registerDoParallel(cl)
  tic()
  ok = 
    foreach(file = files, 
            .packages = c("dplyr", "feather")) %dopar% {
              tracking = read_feather(path = paste0(input_folder, file))
              
              any_tracking = tracking %>% filter(. , category == "ANY")  %>% 
                select(c(user_id, date, birth_control_ud))
              any_tracking$tracking_days = 1
              tmp = data.frame(
                user_id = rep(unique(tracking$user_id), each = length(time_vec)), 
                date = rep(time_vec, lu(tracking$user_id)))
              active_tracking = dplyr::full_join(tmp,any_tracking, by = c("user_id","date"))
              active_tracking$tracking_days[is.na(active_tracking$tracking_days)] = 0
              active_tracking$birth_control_ud = ave(active_tracking$birth_control_ud, 
                                                     by =  active_tracking$user_id, 
                                                     FUN = replace_NAs_with_latest_value)
              active_tracking = active_tracking %>%  group_by(user_id) %>%  
                mutate(., tracking = active_tracking_filter(tracking_days))
              active_tracking = active_tracking %>% select(user_id, date,birth_control_ud, tracking)
              
              # compressed table
              active_tracking  = active_tracking %>%  group_by(user_id, birth_control_ud) %>% 
                mutate(transition = diff(c(0, tracking)))
              active_tracking  = active_tracking %>%  group_by(user_id) %>% 
                mutate(stretch_num = cumsum(transition == 1))
              active_tracking  = active_tracking %>%  
                group_by(user_id, birth_control_ud, stretch_num, tracking) %>%  
                mutate(stretch_length = sum(tracking))
              compressed_tracking = active_tracking %>% filter(transition == 1) %>%  
                select(user_id, date,birth_control_ud, stretch_num, stretch_length) %>%  
                rename(start_date = date)
              
              file_name = paste0("active_",file)
              write_feather(compressed_tracking, path = paste0(output_folder,file_name))
            }
  toc()
  stopImplicitCluster()
  
}
## 28525.397 sec elapsed

2.3 Defining sexual behavior, fertility and health indicators

2.3.1 Clue dataset

users = read_feather(path = paste0(IO$output_clue, "users.feather"))

2.3.1.1 Population indicators

time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)
input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)

input_active_tracking = paste0(IO$tmp_clue,"active_tracking/")

indicators_folder = paste0(IO$output_clue, "pop_indicators/")
if(!dir.exists(indicators_folder)){dir.create(indicators_folder)}

if(!file.exists(paste0(indicators_folder, "sex_pop_indicators.feather"))){
  
  tic()
  tracking_pop_agg = foreach(file = files, .combine = rbind, .packages = c("dplyr","tidyverse")) %do% {
    cat("\t",file,"\t||")
    # tracking
    tracking = read_feather(path = paste0(input_folder, file))
    tracking$BC = dict$BC$type[match(tracking$birth_control_ud, dict$BC$birth_control)]
    tracking =  filter(tracking, BC %in% c("F","I"))
    
    # variables aggregates
    tracking_pop_agg = ddply(tracking,
                             .(date,country_area,BC),
                             summarise,
                             n_prot_sex = sum((category == "sex") & (type == "protected_sex"), na.rm = TRUE),
                             n_unprot_sex = sum((category == "sex") & (type == "unprotected_sex"), na.rm = TRUE),
                             n_wd_sex = sum((category == "sex") & (type == "withdrawal_sex"), na.rm = TRUE),
                             n_exercise = sum(category == "exercise", na.rm = TRUE),
                             n_long_sleep = sum((category == "sleep")&(type == ">9"), na.rm = TRUE),
                             n_bleeding = sum((category == "period")&(type != "spotting"), na.rm = TRUE),
                             n_medium_bleeding = sum((category == "period") & (type == "medium"), na.rm = TRUE),
                             n_breast_pain = sum((category == "pain") & (type == "tender_breasts"), na.rm = TRUE),
                             n_pill_taken = sum((category == "pill_hbc") & (type == "taken"), na.rm = TRUE)
    )
    
    tracking_pop_agg = tracking_pop_agg %>%  mutate(n_sex = n_prot_sex + n_unprot_sex + n_wd_sex)
    
    # active tracking
    active_tracking_compressed = read_feather(path = paste0(input_active_tracking,"active_",file))
    active_tracking = expand_compressed_tracking(active_tracking_compressed)
    active_tracking$BC = dict$BC$type[match(active_tracking$birth_control_ud, dict$BC$birth_control)]
    active_tracking =  filter(active_tracking, BC %in% c("F","I"))
    
    # total number of users
    active_tracking$country_area = tracking$country_area[match(active_tracking$user_id, tracking$user_id)]
    active_tracking_agg = ddply(active_tracking,
                                .(date,country_area,BC),
                                summarise,
                                n_users = sum(tracking, na.rm = TRUE)
    )
    
    
    
    tmp = dplyr::full_join(x = active_tracking_agg , y = tracking_pop_agg, by = c("date","country_area","BC")) %>%  
      arrange(country_area, BC, date) %>% 
      replace_na(list(n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0, 
                      n_party = 0, n_bleeding = 0, n_medium_bleeding = 0, n_breast_pain = 0, n_pill_taken = 0))
    
    return(tmp)
  }
  toc()
  
  tic()
  # sum the results from all files
  tmp = tracking_pop_agg %>% group_by(date, country_area, BC) %>% 
    summarise_each(.,sum) %>%  arrange(country_area, BC, date)
  # expand to have one row per possible date
  tmp2 = expand.grid(date = time_vec, country_area = unique(tmp$country_area), BC = unique(tmp$BC))
  tmp3 = dplyr::full_join(tmp, tmp2, by = c("date","country_area","BC")) %>%  
    arrange(country_area, BC, date) %>%  
    replace_na(., list(n_users = 0,n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0, 
                       n_party = 0, n_bleeding = 0, n_medium_bleeding = 0, n_breast_pain = 0, n_pill_taken = 0))
  # final table
  tracking_pop_agg = tmp3
  
  # save the results
  write_feather(tracking_pop_agg, path = paste0(indicators_folder, "sex_pop_indicators.feather"))
  toc()
  
}
tracking_pop_agg = read_feather(path = paste0(indicators_folder, "sex_pop_indicators.feather"))

ggplot(tracking_pop_agg, aes(x = date, y = n_users, col = BC))+
  geom_line()+
  ggtitle("Total # of users per country and birth control")+
  facet_wrap(country_area ~ .)

ggplot(tracking_pop_agg, aes(x = date, y = n_sex/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative sex freq. per country and birth control")+
  facet_wrap(country_area ~ .)
## Warning: Removed 491 rows containing missing values (geom_path).

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_sex/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative sex freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_prot_sex/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative PROTECTED sex freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_unprot_sex/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative UNPROTECTED sex freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_exercise/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative reported PHYSICAL EXERCISE freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_long_sleep/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative reported SLEEP duration >9h freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_medium_bleeding/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative reported MEDIUM BLEEDING freq. per country and birth control")+
  facet_grid(country_area ~ BC)

ggplot(tracking_pop_agg %>%  filter(date > as.Date("2017-06-30")), aes(x = date, y = n_breast_pain/n_users, col = BC))+
  geom_line()+
  ggtitle("Relative reported BREAST PAIN freq. per country and birth control")+
  facet_grid(country_area ~ BC)

2.4 Detecting seasonal patterns in the individual and population indicators

2.4.1 Clue dataset

2.4.1.1 Is the dataset representative of the general population?

While the dataset hold a large number of user for each country/area, it may still not be representative of the population. One way to check that is to see if we can capture country-specific holiday in the remainder of the sexual behavior.

To test that, we will consider all sexual intercourse (protected and unprotected) and discard birth-control differences.

indicators_folder = paste0(IO$output_clue, "pop_indicators/")
tracking_pop_agg = read_feather(path = paste0(indicators_folder, "sex_pop_indicators.feather"))
tt = tracking_pop_agg %>% 
  dplyr::select(.,-BC) %>%  group_by(date, country_area) %>%  summarize_each(sum) %>%  arrange(country_area, date) %>% 
  mutate(r_sex = n_sex/n_users,
         r_exercise = n_exercise/n_users,
         r_long_sleep = n_long_sleep/n_users,
         r_medium_bleeding = n_medium_bleeding/n_users,
         r_breast_pain = n_breast_pain/n_users
  )

ggplot(tt, aes(x = date, y = r_sex, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y")
## Warning: Removed 78 rows containing missing values (geom_path).

ggplot(tt %>% filter(date > as.Date("2016-06-30")), aes(x = date, y = r_sex, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y")

ggplot(tt %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = r_sex, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y")

ggplot(tt, aes(x = date, y = r_exercise, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Physical exercise")
## Warning: Removed 3227 rows containing missing values (geom_path).

ggplot(tt, aes(x = date, y = r_long_sleep, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Slept for more than 9h")
## Warning: Removed 3227 rows containing missing values (geom_path).

ggplot(tt, aes(x = date, y = r_breast_pain, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Reported preast pain")
## Warning: Removed 78 rows containing missing values (geom_path).

ggplot(tt, aes(x = date, y = r_medium_bleeding, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Reported medium bleeding")
## Warning: Removed 78 rows containing missing values (geom_path).

tt = tt %>% filter(date > as.Date("2016-06-30"))

ggplot(tt, aes(x = date, y = r_sex, col = country_area)) +
  geom_line()+ facet_grid(country_area~., scale = "free_y")

seasonal_decomposition_daily_signal = function(x){
  x_o = x
  j = max(0,which(is.na(x_o)))
  if(j > (length(x_o) - 365)){ # if less than a year of data, we just don't do it
    res = data.frame(x = NA, trend = NA, weekly_pattern = NA, seasonal = NA, remainder = NA)[rep(1,length(x_o)),]
    return(res)
  }
  x = x_o[(j+1):length(x_o)]
  res = data.frame(x = x)
  
  # overall trend
  if(length(x)>= 3*365){ # using stl
    x_ts = ts(x, frequency = 365)
    x_stl = stl(x_ts, s.window = "periodic")
    res$trend = x_stl$time.series[,2] %>%  as.numeric()
    tmp_rem = apply(x_stl$time.series[,c(1,3)],1,sum)
  }else{ # using loess
    t = 1:length(x)
    res$trend = predict(loess(x ~ t, span = 1)) %>%  as.numeric()
    tmp_rem = x - res$trend
  }
  
  # weekly oscillations
  x_ts = ts(tmp_rem, frequency = 7)
  x_stl = stl(x_ts, s.window = "periodic")
  res$weekly_pattern = x_stl$time.series[,1]%>%  as.numeric()
  
  # seasonal trend
  if(length(x)>2*365){
    x_ts = ts(apply(x_stl$time.series[,2:3],1,sum), frequency = 365)
    x_stl = stl(x_ts, s.window = "periodic")
    res$seasonal =  x_stl$time.series[,1]%>%  as.numeric()
  }else{
    res$seasonal = 0
  }
  
  # remainder
  res$remainder = res$x - res$trend - res$weekly_pattern - res$seasonal
  
  if(j>0){
    res_NAs = NA*res[rep(1,j),]
    res = rbind(res_NAs, res)
  }
  
  return(res)
}
sdds = foreach(country = unique(tt$country_area),
               # c("France","Germany","Brazil - Central-West","United States - California","United States - Northeast"),
               # unique(tt$country_area),  
               .combine = rbind) %do%{ # 
                 j = which(tt$country_area == country)
                 x = tt$r_sex[j]
                 res = seasonal_decomposition_daily_signal(x)
                 colnames(res) = paste0("r_sex_",colnames(res))
                 res$country_area = country
                 res$date = tt$date[j]
                 return(res)
               }

tmp = dplyr::full_join(x = tt, y = sdds %>% dplyr::select(-r_sex_x), by = c("country_area","date"))
tt = tmp
tt$r_sex_country_typical = tt$r_sex - tt$r_sex_trend - tt$r_sex_weekly_pattern

countries_areas = tt$country_area %>% str_split_fixed(.," - ",n = 2)
tt$country = countries_areas[,1]
tt$area = countries_areas[,2]

write_feather(tt, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_overall_by_country.feather"))
tt = read_feather(path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_overall_by_country.feather"))
selected_holidays  = holidays %>%  filter(country != "Switzerland", country != "Germany")

ggplot(tt, aes(x = date, y = r_sex_country_typical, col = area))+
  geom_vline(data = holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_line()+
  ggtitle("Total # of sex; overal + weekly trends removed")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  facet_grid(country ~ ., scale = "free")

ggplot(tt %>%  filter(date >= as.Date("2017-07-01"), country != "Switzerland"), aes(x = date,y = r_sex_country_typical, col = area))+ # country_area
  geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
  geom_line()+
  ggtitle("Total # of sex; overal + weekly trends removed")+
  facet_grid(country ~ ., scale = "free")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
  theme(legend.position = "bottom")
## Warning: Removed 75 rows containing missing values (geom_text).

ggplot(tt %>%  filter(date >= as.Date("2018-07-01"), country != "Switzerland"), aes(x = date, y = r_sex_weekly_pattern, col = area))+  
  geom_line()+
  ggtitle("Weekly trend")+  
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  facet_grid(country ~ .)+ # , scale = "free" +
  theme(legend.position = "bottom")

tt$weekday = wday(tt$date, week_start = 1, label = TRUE)
weekly_pattern = unique(tt %>%  dplyr::select(country_area, country, area, weekday, r_sex_weekly_pattern))

ggplot(weekly_pattern %>%  filter(country != "Switzerland"), aes(x = weekday, y = r_sex_weekly_pattern, col = country))+  
  geom_line(aes(group = country))+
  geom_point()+
  ggtitle("Weekly pattern per country")+  
  facet_grid(. ~ country_area) + # , scale = "free" +
  guides(col = "none")

2.4.1.1.1 Checking for reporting bias using other features
sddo = foreach(country = unique(tt$country_area),
               .combine = rbind) %do%{ # 
                 cat(country,"\n")
                 j = which(tt$country_area == country)
                 
                 x = tt$r_exercise[j]
                 res = seasonal_decomposition_daily_signal(x)
                 colnames(res) = paste0("r_exercise","_",colnames(res))
                 all_feat_res = res
                 
                 x = tt$r_long_sleep[j]
                 res = seasonal_decomposition_daily_signal(x)
                 colnames(res) = paste0("r_long_sleep","_",colnames(res))
                 all_feat_res = cbind(all_feat_res,res)
                 
                 x = tt$r_medium_bleeding[j]
                 res = seasonal_decomposition_daily_signal(x)
                 colnames(res) = paste0("r_medium_bleeding","_",colnames(res))
                 all_feat_res = cbind(all_feat_res,res)
                 
                 x = tt$r_breast_pain[j]
                 res = seasonal_decomposition_daily_signal(x)
                 colnames(res) = paste0("r_breast_pain","_",colnames(res))
                 all_feat_res = cbind(all_feat_res,res)
                 
                 
                 all_feat_res$country_area = country
                 all_feat_res$date = tt$date[j]
                 return(all_feat_res)
               }
## Brazil - Central-West 
## Brazil - Northeast 
## France 
## United Kingdom 
## United States - California 
## United States - Northeast
tmp = dplyr::full_join(x = tt, 
                       y = sddo %>% dplyr::select(-r_exercise_x, r_long_sleep_x, r_medium_bleeding_x, r_breast_pain_x), 
                       by = c("country_area","date"))



ttt = tmp
ttt$r_exercise_country_typical = ttt$r_exercise - ttt$r_exercise_trend - ttt$r_exercise_weekly_pattern
ttt$r_long_sleep_country_typical = ttt$r_long_sleep - ttt$r_long_sleep_trend - ttt$r_long_sleep_weekly_pattern
ttt$r_medium_bleeding_country_typical = ttt$r_medium_bleeding - ttt$r_medium_bleeding_trend - ttt$r_medium_bleeding_weekly_pattern
ttt$r_breast_pain_country_typical = ttt$r_breast_pain - ttt$r_breast_pain_trend - ttt$r_breast_pain_weekly_pattern



countries_areas = ttt$country_area %>% str_split_fixed(.," - ",n = 2)
ttt$country = countries_areas[,1]
ttt$area = countries_areas[,2]

write_feather(ttt, path =paste0(IO$output_clue,"pop_indicators/relative_other_features_indicators_overall_by_country.feather"))
ttt = read_feather(path =paste0(IO$output_clue,"pop_indicators/relative_other_features_indicators_overall_by_country.feather"))
selected_holidays  = holidays %>%  filter(country != "Switzerland", country != "Germany")

# ggplot(ttt, aes(x = date, y = r_exercise_country_typical, col = area))+
#   geom_vline(data = holidays, aes(xintercept = date), linetype = 1, col = "gold")+
#   geom_line()+
#   ggtitle("Exercise; overal + weekly trends removed")+
#   scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
#   facet_grid(country ~ ., scale = "free")

ggplot(ttt %>%  filter(date >= as.Date("2017-07-01")), 
       aes(x = date,y = r_exercise_country_typical, col = area))+ # country_area
  geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), 
            hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
  geom_line()+
  ggtitle("Exercise; overal + weekly trends removed")+
  facet_grid(country ~ ., scale = "free")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
  theme(legend.position = "bottom")
## Warning: Removed 75 rows containing missing values (geom_text).
## Warning: Removed 950 rows containing missing values (geom_path).

ggplot(ttt %>%  filter(date >= as.Date("2017-07-01")), 
       aes(x = date,y = r_long_sleep_country_typical, col = area))+ # country_area
  geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), 
            hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
  geom_line()+
  ggtitle("Slept more than 9 hours; overal + weekly trends removed")+
  facet_grid(country ~ ., scale = "free")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
  theme(legend.position = "bottom")
## Warning: Removed 75 rows containing missing values (geom_text).

## Warning: Removed 950 rows containing missing values (geom_path).

ggplot(ttt %>%  filter(date >= as.Date("2017-07-01")), 
       aes(x = date,y = r_medium_bleeding_country_typical, col = area))+ # country_area
  geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), 
            hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
  geom_line()+
  ggtitle("Medium Bleeding; overal + weekly trends removed")+
  facet_grid(country ~ ., scale = "free")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
  theme(legend.position = "bottom")
## Warning: Removed 75 rows containing missing values (geom_text).

ggplot(ttt %>%  filter(date >= as.Date("2017-07-01")), 
       aes(x = date,y = r_breast_pain_country_typical, col = area))+ # country_area
  geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
  geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), 
            hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
  geom_line()+
  ggtitle("Breast Pain; overal + weekly trends removed")+
  facet_grid(country ~ ., scale = "free")+
  scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
  xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
  theme(legend.position = "bottom")
## Warning: Removed 75 rows containing missing values (geom_text).

ttt$weekday = wday(ttt$date, week_start = 1, label = TRUE)
weekly_pattern_other_features = unique(ttt %>%  dplyr::select(country_area, country, area, weekday, 
                                                              r_exercise_weekly_pattern,
                                                              r_long_sleep_weekly_pattern,
                                                              r_medium_bleeding_weekly_pattern,
                                                              r_breast_pain_weekly_pattern))

weekly_pattern_other_features_long = tidyr::pivot_longer(weekly_pattern_other_features, 
                                                         cols = c(r_exercise_weekly_pattern,
                                                                  r_long_sleep_weekly_pattern,
                                                                  r_medium_bleeding_weekly_pattern,
                                                                  r_breast_pain_weekly_pattern),
                                                         names_to = "feature")
weekly_pattern_other_features_long$feature = weekly_pattern_other_features_long$feature %>% 
  str_replace(.,"r_","") %>%
  str_replace(.,"_weekly_pattern","")

ggplot(weekly_pattern_other_features_long, 
       aes(x = weekday, y = value, col = country))+  
  geom_line(aes(group = country_area))+
  geom_point()+
  ggtitle("Weekly pattern per country")+  
  facet_grid(. ~ feature)  # , scale = "free" +
## Warning: Removed 84 rows containing missing values (geom_point).

#guides(col = "none")


weekly_pattern_sex_long = weekly_pattern %>% mutate(feature = "sex", value = r_sex_weekly_pattern) %>% dplyr::select(-r_sex_weekly_pattern)
weekly_pattern_all_features_long = rbind(weekly_pattern_other_features_long, weekly_pattern_sex_long)

ggplot(weekly_pattern_all_features_long, 
       aes(x = weekday, y = value, col = country))+  
  geom_line(aes(group = country_area))+
  geom_point()+
  ggtitle("Weekly pattern per country")+  
  facet_grid(. ~ feature)  
## Warning: Removed 84 rows containing missing values (geom_point).

2.4.1.1.2 Checking for synchronicity in bleeding (by country and BC)
sb = tracking_pop_agg %>% 
  dplyr::filter(date >= as.Date("2017-07-01")) %>% 
  mutate(r_bleeding = n_bleeding/n_users,
         cat = interaction(country_area,BC)) %>% 
  arrange(cat, date)

sdb = foreach(cat = unique(sb$cat),
              .combine = rbind) %do%{ # 
                cat(cat %>% as.character(),"\n")
                j = which(sb$cat == cat)
                
                x = sb$r_bleeding[j]
                res = seasonal_decomposition_daily_signal(x)
                colnames(res) = paste0("r_bleeding","_",colnames(res))
                
                res$cat = cat
                res$date = sb$date[j]
                return(res)
              }
## Brazil - Central-West.F 
## Brazil - Northeast.F 
## France.F 
## United Kingdom.F 
## United States - California.F 
## United States - Northeast.F 
## Brazil - Central-West.I 
## Brazil - Northeast.I 
## France.I 
## United Kingdom.I 
## United States - California.I 
## United States - Northeast.I
tmp = dplyr::full_join(x = sb, 
                       y = sdb %>% dplyr::select(-r_bleeding_x), 
                       by = c("cat","date"))

sb = tmp; rm(tmp, sdb)
ggplot(sb %>% dplyr::filter(date %in% seq(min(sb$date), by = 1, len = 21)), 
       aes(x = date, y = r_bleeding_weekly_pattern, col = BC))+
  geom_line()+geom_point()+
  ggtitle("Weekly pattern (from time-series decomposition)")+
  scale_x_date(date_labels = "%a %b %d %y")+
  facet_grid(country_area ~ .)

ggplot(sb, 
       aes(x = date, y = r_bleeding_remainder, col = BC))+
  geom_line()+
  ggtitle("Detrended bleeding pattern (overal trend and weekly trend removed)")+
  facet_grid(country_area ~ .)

2.4.1.2 Seasonality in montly aggregates

Workflow:

  • Construct a table that has a datapoint for each day; country; birth control category (pp)

  • Detrend the time-series per country and birth-control

  • Construct a table with the montly aggregates (pa)

all_BC =  tracking_pop_agg %>% dplyr::select(-BC) %>%  group_by(country_area, date) %>% 
  summarize_each(sum)

all_BC$BC = "all"
all_BC = all_BC %>%  dplyr::select(colnames(tracking_pop_agg))

pp = rbind(tracking_pop_agg %>%  as.data.frame(), all_BC %>%  as.data.frame()) 
pp = pp %>%  arrange(country_area, BC, date)
pp = pp %>%  filter(pp$date >= as.Date("2016-07-01"))

ggplot(pp, aes(x = date, y = n_users, col = BC))+
  geom_hline(yintercept = 1000)+
  geom_line()+
  ggtitle("total number of users (monthly)")+
  facet_wrap(country_area~.)

pp = pp %>%  mutate(
  r_sex = n_sex/n_users,
  r_prot = n_prot_sex/n_users,
  r_unprot = n_unprot_sex/n_users)
pp = pp %>%  filter(country_area != "Switzerland")

pp$cat = interaction(pp$country_area, pp$BC)

Seasonal decomposition on the Brazilian (Central West area) time-serie.

Tests to identifying the overal trend.

xx = pp$r_sex[pp$cat == "Brazil - Central-West.all"]
x = xx
head(x)
## [1] 0.10869565 0.02083333 0.12500000 0.08163265 0.08000000 0.10000000
length(x)
## [1] 1095
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Overal trend on raw time-series")

# x_ts = ts(x, frequency = 7)
# x_stl = stl(x_ts, s.window = "periodic")
# plot(x_stl, main = "Weekly trend on raw time-series")
# 
# x_ts = ts(x, frequency = 365/2)
# x_stl = stl(x_ts, s.window = "periodic")
# plot(x_stl, main = "1/2 year trend on raw time-series")

Seasonal decomposition on the Brazilian (Central West area) time-serie.

x = xx
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Overal trend on raw time-series")

res = data.frame(x = x, trend = x_stl$time.series[,2]  %>%  as.numeric() )

x = res$x - res$trend
x_ts = ts(x, frequency = 7)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Weekly pattern on the detrended time-series")

res$weekly = x_stl$time.series[,1] %>%  as.numeric()
x = res$x - res$trend - res$weekly 
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Seasonal pattern on the detrended (overal and weekly) time-series")

Detrending time-series of all countries (except Switzerland)

detrend_daily_signal = function(x){
  res = data.frame(x = x)
  # overall trend
  x_ts = ts(x, frequency = 365)
  x_stl = stl(x_ts, s.window = "periodic")
  res$trend = x_stl$time.series[,2] %>%  as.numeric()
  res$detrended = x - res$trend
  #res$detrended_shifted = res$detrended + mean(res$trend)
  return(res)
}

detrended = foreach(category = unique(pp$cat), .combine = rbind) %do%{
  res = foreach(feature = c("r_sex","r_prot","r_unprot"), .combine = rbind) %do%{
    x = pp %>%  filter(cat == category) %>%  dplyr::select(feature) %>%  unlist() %>%  as.numeric() %>% replace_na(0)
    res = detrend_daily_signal(x)
    res = res %>%  mutate(feature = feature,
                          cat = category,
                          date = pp$date[which(pp$cat == category)])
  }
  return(res)
}
detrended_wide = detrended %>% 
  dplyr::select(date,cat,feature,trend,detrended) %>% 
  tidyr::pivot_wider(names_from = feature, values_from = c(trend, detrended))

tmp = dplyr::full_join(x = pp, y = detrended_wide, by = c("cat","date"))
pp = tmp


write_feather(pp, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators.feather"))



ggplot(pp, aes(x = date, y = n_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")


ggplot(pp, aes(x = date, y = r_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp, aes(x = date, y = detrended_r_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp, aes(x = date, y = detrended_r_unprot, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp %>%  filter(month>=2017.5), aes(x = date, y = detrended_r_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ BC, scale = "free")


ggplot(pp %>%  filter(month>=2017.5), aes(x = date, y = detrended_r_unprot, col = BC))+
  geom_line()+
  facet_grid(country_area ~ BC, scale = "free")
detrend_daily_signal = function(x){
  res = data.frame(x = x, i = 1:length(x))
  ll = loess(x ~ i, res, span = 365/length(x)) # yearly span
  res$trend = predict(ll)
  res$detrended = x - res$trend
  return(res)
}

pp_full = pp
pp = pp %>% filter(date >= as.Date("2017-07-01"))

detrended = foreach(category = unique(pp$cat), .combine = rbind) %do%{
  res = foreach(feature = c("r_sex","r_prot","r_unprot"), .combine = rbind) %do%{
    x = pp %>%  filter(cat == category) %>%  dplyr::select(feature) %>%  unlist() %>%  as.numeric() %>% replace_na(0)
    res = detrend_daily_signal(x)
    res = res %>%  mutate(feature = feature,
                          cat = category,
                          date = pp$date[which(pp$cat == category)])
  }
  return(res)
}
detrended_wide = detrended %>% 
  dplyr::select(date,cat,feature,trend,detrended) %>% 
  tidyr::pivot_wider(names_from = feature, values_from = c(trend, detrended))

tmp = dplyr::full_join(x = pp, y = detrended_wide, by = c("cat","date"))
pp = tmp

write_feather(pp, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended.feather"))



ggplot(pp, aes(x = date, y = n_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp, aes(x = date, y = r_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp, aes(x = date, y = detrended_r_sex, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

ggplot(pp, aes(x = date, y = detrended_r_unprot, col = BC))+
  geom_line()+
  facet_grid(country_area ~ ., scale = "free")

pp$date_month = year(pp$date) + (month(pp$date)-1)/12
pp$month = month(pp$date)
pp$year = year(pp$date)

pa = pp %>% 
  dplyr::select(country_area, BC, date_month,year, month,detrended_r_sex, detrended_r_prot, detrended_r_unprot) %>%  
  group_by(country_area, BC, year, date_month,month) %>% 
  summarize_each(mean) %>% 
  mutate(year = floor(date_month),
         date = str_c(year,"-",month,"-01") %>% as.Date(), 
         country = str_split_fixed(country_area, " - ",2)[,1],
         area = str_split_fixed(country_area, " - ",2)[,2])

write_feather(pa, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended_monthly.feather"))


# pa = pa %>%  mutate(
#   r_sex = n_sex/n_users,
#   r_prot = n_prot_sex/n_users,
#   r_unprot = n_unprot_sex/n_users)

ggplot(pa, aes(x = date_month, y = detrended_r_sex, col = BC))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+
  ggtitle("Relative sex (all) counts (monthly)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = date_month, y = detrended_r_prot, col = BC))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+
  ggtitle("Relative PROTECTED sex counts (monthly)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = date_month, y = detrended_r_unprot, col = BC))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+
  ggtitle("Relative UNPROTECTED sex counts (monthly)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = month, y = detrended_r_sex, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+ scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative sex (all) counts (by month)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = month, y = detrended_r_prot, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative PROTECTED sex counts (by month)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = month, y = detrended_r_unprot, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative UNPROTECTED sex counts (by month)")+
  facet_wrap(country_area ~ .)

ggplot(pa, aes(x = month, y = detrended_r_sex, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+ scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative sex (all) counts (by month)")+
  facet_grid(country_area ~ BC)

ggplot(pa, aes(x = month, y = detrended_r_prot, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative PROTECTED sex counts (by month)")+
  facet_grid(country_area ~ BC)

ggplot(pa, aes(x = month, y = detrended_r_unprot, col = BC, group = interaction(BC, year)))+
  geom_hline(yintercept = 0, col = "gray50")+
  geom_line()+scale_x_continuous(breaks = 1:12)+
  ggtitle("Relative UNPROTECTED sex counts (by month)")+
  facet_grid(country_area ~ BC)

2.5 Birth model

3 Results

3.1 Datasets and user’s demographics

3.1.1 Clue

users = read_feather(path = paste0(IO$output_clue, "users.feather"))
users$country_area = factor(users$country_area, levels = names(sort(table(users$country_area))))
ggplot(users, aes(x = reorder(country_area,desc(country_area)), fill = country_area)) + coord_flip() + geom_bar() + guides(fill = FALSE) + 
  xlab("") + ylab("# of users") + ggtitle("# of users per country") 

ggplot(users, aes(x = birth_year_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE) + xlab("") + ylab("# of users") +
  ggtitle("Birth year (Age) distribution")

ggplot(users, aes(x = height_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE) + xlab("") + ylab("# of users") + 
  ggtitle("Users' height distribution")

ggplot(users, aes(x = weight_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE)+ xlab("") + ylab("# of users") + 
  ggtitle("Users' weight distribution")

ggplot(users, aes(x = est_mean_bmi, fill = country_area)) +  geom_histogram(binwidth = 2) + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE)+ xlab("") + ylab("# of users") + 
  ggtitle("Users' approx. BMI distribution")
## Warning: Removed 153517 rows containing non-finite values (stat_bin).

users$latest_birth_control = factor(users$latest_birth_control, levels = names(sort(table(users$latest_birth_control))))
ggplot(users, aes(x = latest_birth_control, fill = country_area)) + coord_flip() + geom_bar()+ guides(fill = FALSE) +  xlab("") + ylab("# of users") + ggtitle("Distribution of Birth Control")

3.2 Do sexual intercourse patterns drive birth seasonal rhythms?

3.2.1 Clue

tt = read_feather(str_c(IO$output_clue,"pop_indicators/","relative_sexual_indicators_overall_by_country.feather"))
selected_holidays = holidays %>%  dplyr::filter(country %in% unique(tt$country))

load(str_c(IO$out_Rdata,"birth_data.Rdata"), verbose = TRUE)
## Loading objects:
##   birth
pa = read_feather(path = str_c(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended_monthly.feather"))
for(Country_Area in unique(tt$country_area)){
  
  Country = unique(tt$country[tt$country_area == Country_Area])
  Area = unique(tt$area[tt$country_area == Country_Area])
  
  
  # Birth data
  
  this_birth = birth %>% dplyr::filter(country == Country, (area == Area) | (area == "Whole country"))
  max_date = max(this_birth$date)
  if(month(max_date)<6){max_year = year(max_date)-1}else{max_year = year(max_date)}
  date_range = c(str_c(max_year - 2,"-07-01"),str_c(max_year,"-07-01")) %>%  as.Date
  
  this_birth_last_2_years = this_birth %>% dplyr::filter( date >= min(date_range), date < max(date_range))
  
  g_birth = ggplot(this_birth_last_2_years, aes(x = date, y = births, 
                                                col = area,
                                                group = interaction(source, area, reliability)))+
    geom_line()+
    scale_x_date(limits = date_range, expand = c(0,0))+
    ggtitle(str_c("Births - ",Country_Area))+
    facet_grid(area ~ ., scale = "free")
  g_birth
  
  # Shifted Birth data (= conceptions)
  
  this_conception_last_2_years = this_birth %>% mutate(date = date - months(8)) %>% 
    dplyr::filter( date >= min(date_range), date < max(date_range))
  
  g_conception = ggplot(this_conception_last_2_years, aes(x = date, y = births, 
                                                          col = area,
                                                          group = interaction(source, area, reliability)))+
    geom_line()+
    facet_grid(area ~ ., scale = "free")+
    ggtitle("Shifted births (conceptions)")+
    scale_x_date(limits = date_range, expand = c(0,0))
  g_conception
  
  
  # Monthly aggregate sexual patterns
  
  this_pa = pa %>% dplyr::filter(country_area == Country_Area, BC == "F")
  
  g_sex_monthly = ggplot(this_pa, aes(x =  date, y = detrended_r_unprot, col = area))+
    geom_hline(yintercept = 0, col = "gray80")+
    geom_line()+
    scale_x_date(limits = c(as.Date("2017-06-30"),as.Date("2019-07-01")),expand = c(0,0))+
    facet_grid(area ~ ., scale = "free")
  g_sex_monthly
  
  
  # Sexual patterns
  
  this_tt = tt %>%   dplyr::filter(country_area == Country_Area, date >= as.Date("2017-07-01"))
  this_holidays =  holidays %>%  dplyr::filter(country == Country)
  
  
  g_typical = ggplot(this_tt, aes(x = date, y = r_sex_country_typical, col = area))+ 
    geom_vline(data = this_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
    geom_text(data = this_holidays, aes(x = date, y = Inf, label = date_str), 
              hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
    geom_line()+
    ggtitle("Total # of sex; overal + weekly trends removed")+
    ylab("Detrended sex")+
    facet_grid(area ~ ., scale = "free")+
    scale_color_manual(values = c(rep("gray40",1),"coral1","slategray1"))+
    scale_x_date(limits = c(as.Date("2017-06-30"),as.Date("2019-07-01")),expand = c(0,0))
  
  g_typical
  
 
  
  
  plotlist = list(g_birth, g_conception, g_sex_monthly, g_typical)
  
  g = plot_grid(plotlist = plotlist, ncol=1, align="v")
  print(g)
  
}
## Warning: Removed 22 rows containing missing values (geom_text).

## Warning: Removed 22 rows containing missing values (geom_text).

## Warning: Removed 22 rows containing missing values (geom_text).

## Warning: Removed 17 rows containing missing values (geom_text).

## Warning: Removed 14 rows containing missing values (geom_text).

## Warning: Removed 14 rows containing missing values (geom_text).